import io
import os
import sys
import contextlib
import pandas as pd
import seaborn as sns
from IPython.display import display, Javascript
NOVA_HOME = '/home/projects/hornsteinlab/Collaboration/NOVA'
NOVA_DATA_HOME = '/home/projects/hornsteinlab/Collaboration/NOVA'
os.environ['NOVA_HOME'] = NOVA_HOME
sys.path.insert(1, os.getenv("NOVA_HOME"))
print(f"NOVA_HOME: {os.getenv('NOVA_HOME')}")
root_directory_raw = os.path.join(NOVA_DATA_HOME, 'input', 'images', 'raw', 'AlyssaCoyne_new_sorted')
root_directory_proc = os.path.join(NOVA_DATA_HOME, 'input', 'images', 'processed', 'ManuscriptFinalData_80pct','AlyssaCoyne_new')
LOGS_PATH = os.path.join(NOVA_HOME, "outputs", "preprocessing", "ManuscriptFinalData_80pct", "AlyssaCoyne_new", "batch1/logs")
PLOT_PATH = os.path.join(NOVA_HOME, 'outputs', 'preprocessing', "ManuscriptFinalData_80pct", 'AlyssaCoyne_new', 'QC_figures')
os.chdir(NOVA_HOME)
from tools.preprocessing_tools.qc_reports.qc_utils import log_files_qc, run_validate_folder_structure, display_diff, sample_and_calc_variance, \
show_site_survival_dapi_brenner, show_site_survival_dapi_cellpose, \
show_site_survival_dapi_tiling, show_site_survival_target_brenner, \
calc_total_sums, plot_filtering_heatmap, show_total_sum_tables, \
plot_cell_count, plot_catplot, plot_hm_combine_batches, \
run_calc_hist_new,plot_hm_of_mean_cell_count_per_tile
from tools.preprocessing_tools.qc_reports.qc_config import AC_panels_new, AC_markers_new, AC_marker_info_new, AC_cell_lines_new, AC_cell_lines_to_cond_new,\
AC_cell_lines_for_disp_new, AC_reps_new, AC_line_colors_new, AC_lines_order_new, AC_custom_palette_new,\
AC_expected_dapi_raw_new
%load_ext autoreload
%autoreload 2
NOVA_HOME: /home/projects/hornsteinlab/Collaboration/NOVA
df = log_files_qc(LOGS_PATH)
df_dapi = df[df.marker=='DAPI']
df_target = df[df.marker!='DAPI']
reading logs of batch1 Total of 1 files were read. Before dup handeling (5259, 21) After duplication removal #1: (5259, 22) After duplication removal #2: (5259, 22)
# choose batches
batches = ['batch1']
batches
['batch1']
raws = run_validate_folder_structure(root_directory_raw,
False,
AC_panels_new,
AC_markers_new.copy(),
None,
AC_marker_info_new,
AC_cell_lines_to_cond_new,
AC_reps_new,
AC_cell_lines_for_disp_new,
AC_expected_dapi_raw_new,
batches=batches,
fig_width=8,
expected_count=5,
check_antibody=False)
batch1 Folder structure is valid. No bad files are found. Total Sites: 5286
======== ====================
procs = run_validate_folder_structure(root_directory_proc, True, AC_panels_new, AC_markers_new,None, AC_marker_info_new,
AC_cell_lines_to_cond_new, AC_reps_new, AC_cell_lines_for_disp_new, AC_expected_dapi_raw_new,
batches=batches, fig_width=8,expected_count=5, check_antibody=False)
batch1 Folder structure is valid. No bad files are found. Total Sites: 5216
======== ====================
display_diff(batches, raws, procs, None, fig_width=8)
batch1
========
for batch in batches:
with contextlib.redirect_stdout(io.StringIO()):
var = sample_and_calc_variance(root_directory_proc, batch,
sample_size_per_markers=200, cond_count=1, rep_count=len(AC_reps_new),
num_markers=len(AC_markers_new))
print(f'{batch} var: ',var)
batch1 var: 0.03485630778603039
By order of filtering
Percentage out of the total sites
print('NOT REALLY RELEVANT - RUNNING WITHOUT BRENNER!')
dapi_filter_by_brenner = show_site_survival_dapi_brenner(df_dapi,batches, AC_line_colors_new, AC_panels_new,
figsize=(6,3), reps=AC_reps_new, vmax=5)
NOT REALLY RELEVANT - RUNNING WITHOUT BRENNER!
Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.
A site will be filtered out if Cellpose found 0 cells in it.
dapi_filter_by_cellpose = show_site_survival_dapi_cellpose(df_dapi, batches, dapi_filter_by_brenner,
AC_line_colors_new, AC_panels_new, figsize=(10,5), reps=AC_reps_new)
Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.
A site will be filtered out if after tiling, no tile is containing at least 85% of a cell that Cellpose detected.
dapi_filter_by_tiling=show_site_survival_dapi_tiling(df_dapi, batches, dapi_filter_by_cellpose,
AC_line_colors_new, AC_panels_new,figsize=(10,5), reps=AC_reps_new)
Percentage out of the sites that passed the previous filter. In parenthesis are absolute values (if different than the percentages).
print('NOT REALLY RELEVANT - RUNNING WITHOUT BRENNER!')
show_site_survival_target_brenner(df_dapi, df_target, dapi_filter_by_tiling,
figsize=(10,8), markers=AC_markers_new)
NOT REALLY RELEVANT - RUNNING WITHOUT BRENNER!
names = ['Total number of tiles', 'Total number of whole cells']
stats = ['n_valid_tiles','site_whole_cells_counts_sum','site_cell_count','site_cell_count_sum']
total_sum = calc_total_sums(df_target, df_dapi, stats, AC_markers_new)
total_sum.n_valid_tiles.sum() # 68,148
37122
total_sum[total_sum.marker =='DAPI'].site_whole_cells_counts_sum.sum() # 11647.0
11648.0
total_sum[total_sum.marker =='DAPI'].site_cell_count.sum() # 55397.0
55404.0
show_total_sum_tables(total_sum)
| n_valid_tiles | % valid tiles | site_whole_cells_counts_sum | site_cell_count | |
|---|---|---|---|---|
| batch1 | ||||
| count | 1056.000000 | 1056.000000 | 1056.000000 | 1056.000000 |
| mean | 35.153409 | 0.351534 | 40.191288 | 191.840909 |
| std | 12.160115 | 0.121601 | 12.419804 | 52.960553 |
| min | 7.000000 | 0.070000 | 11.000000 | 61.000000 |
| 25% | 26.000000 | 0.260000 | 31.000000 | 148.000000 |
| 50% | 36.000000 | 0.360000 | 40.000000 | 193.000000 |
| 75% | 45.000000 | 0.450000 | 48.000000 | 232.000000 |
| max | 61.000000 | 0.610000 | 86.000000 | 343.000000 |
| sum | 37122.000000 | NaN | 42442.000000 | 202584.000000 |
| expected_count | 450.000000 | 450.000000 | 450.000000 | 450.000000 |
| n valid tiles | % valid tiles | site_whole_cells_counts_sum | site_cell_count | |
|---|---|---|---|---|
| All batches | ||||
| count | 1056.000000 | 1056.000000 | 1056.000000 | 1056.000000 |
| mean | 35.153409 | 0.351534 | 40.191288 | 191.840909 |
| std | 12.160115 | 0.121601 | 12.419804 | 52.960553 |
| min | 7.000000 | 0.070000 | 11.000000 | 61.000000 |
| 25% | 26.000000 | 0.260000 | 31.000000 | 148.000000 |
| 50% | 36.000000 | 0.360000 | 40.000000 | 193.000000 |
| 75% | 45.000000 | 0.450000 | 48.000000 | 232.000000 |
| max | 61.000000 | 0.610000 | 86.000000 | 343.000000 |
| sum | 37122.000000 | NaN | 42442.000000 | 202584.000000 |
| expected_count | 450.000000 | 450.000000 | 450.000000 | 450.000000 |
For each batch, cell line, replicate and markerTotal number of tiles
to_heatmap = total_sum.rename(columns={'n_valid_tiles':'index'})
plot_filtering_heatmap(to_heatmap, extra_index='marker', vmin=None, vmax=None,
xlabel = 'Total number of tiles', show_sum=True, figsize=(8,8))
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6) /home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
For each batch, cell line, replicate and markerTotal number of tiles
to_heatmap = total_sum.rename(columns={'site_whole_cells_counts_sum':'index'})
plot_filtering_heatmap(to_heatmap, extra_index='marker', vmin=None, vmax=None,
xlabel = 'Total number of whole cells', show_sum=True, figsize=(8,8))
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6) /home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
df_no_empty_sites = df_dapi[df_dapi.n_valid_tiles !=0]
plot_cell_count(df_no_empty_sites, AC_lines_order_new, AC_custom_palette_new, y='site_cell_count_sum',
title='Cell Count Average per Site (from tiles)', figsize=(9,4))
plot_cell_count(df_no_empty_sites, AC_lines_order_new, AC_custom_palette_new, y='site_whole_cells_counts_sum',
title='Whole Cell Count Average per Site',figsize=(9,4))
plot_cell_count(df_no_empty_sites, AC_lines_order_new, AC_custom_palette_new, y='site_cell_count',
title='Cellpose Cell Count Average per Site',figsize=(9,4))
df_dapi.groupby(['cell_line_cond']).n_valid_tiles.mean()
cell_line_cond C9-CS2YNL 9.188525 C9-CS7VCZ 7.125000 C9-CS8RFT 8.700000 Ctrl-EDi022 7.658333 Ctrl-EDi029 6.400000 Ctrl-EDi037 4.366667 SALSNegative-CS0ANK 6.850000 SALSNegative-CS0JPP 2.750000 SALSNegative-CS6ZU8 6.175000 SALSPositive-CS2FN3 8.941667 SALSPositive-CS4ZCD 6.050000 SALSPositive-CS7TN6 9.716667 Name: n_valid_tiles, dtype: float64
plot_catplot(df_dapi,
sns.color_palette('colorblind'),
reps=AC_reps_new,
x='cell_line',
y_title='Valid Tiles Count',
x_title='Cell Line',
y='n_valid_tiles',
hue='rep',
height=5,
aspect=5)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1061: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df.loc[:, 'batch_rep'] = df['batch'] + " " + df['rep'] /home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1074: UserWarning: The palette list has more values (10) than needed (2), which may not be intended. g = sns.catplot(kind='box', data=df, y=y, x=x,height=height, aspect=aspect, hue=hue, palette=palette,
plot_hm_of_mean_cell_count_per_tile(df_dapi, split_by=None,rows='cell_line', columns='rep')
# for batch in batches:
# print(batch)
# run_calc_hist_new(batch,AC_cell_lines_for_disp, AC_markers, root_directory_raw, root_directory_proc,
# hist_sample=10,sample_size_per_markers=10, ncols=4, nrows=1, figsize=(6,2))
# print("="*30)
# save notebook as HTML ( the HTML will be saved in the same folder the original script is)
display(Javascript('IPython.notebook.save_checkpoint();'))
os.system(f'jupyter nbconvert --to html {NOVA_HOME}/tools/preprocessing_tools/qc_reports/qc_report_CoyneLab_new.ipynb --output {NOVA_HOME}/manuscript/preprocessing_qc_reports/ManuscriptFinalData/qc_report_CoyneLab_new.html')